Our goal here is to go over designing guides for a tiling screen on positive genes. We’ll design CRISPR guide libraries for the top genes in the screen. First we’ll go over how to do one gene and then extend the method to other genes.
Several studies have shown improved interference and activation with FANTOM5 annotated TSS’s. We shall do the same. TSS information was downloaded from http://biomart.gsc.riken.jp/ and lifted over to mm10 using the UCSC liftover tool (https://genome.ucsc.edu/cgi-bin/hgLiftOver).
setwd("~/sgRNA/sgRNAdesign/")
genes = scan("~/sgRNA/Yanxia/genes4tilingscreen.txt", what = character())
FANTOM5mm10TSSannotation = read.table(file = "~/sgRNA/sgRNAdesign/mm10TSSliftover.bed")
head(FANTOM5mm10TSSannotation)
## V1 V2 V3 V4 V5 V6 V7
## 1 chr10 100589202 100589203 p2@4930430F08Rik,0.3007 55 - 100589202
## 2 chr10 100589205 100589269 p1@4930430F08Rik,0.3053 0 - 100589205
## 3 chr10 100592365 100592382 p1@1700017N19Rik,0.1939 -3 + 100592365
## 4 chr10 100592386 100592406 p2@1700017N19Rik,0.1988 0 + 100592386
## 5 chr10 10343097 10343101 p1@ENSMUST00000118589,0.0800 -13 + 10343097
## 6 chr10 101681605 101681621 p12@Mgat4c,0.3624 -19 + 101681605
## V8 V9
## 1 100589203 60,179,113
## 2 100589269 60,179,113
## 3 100592382 211,211,211
## 4 100592406 211,211,211
## 5 10343101 211,211,211
## 6 101681621 60,179,113
Top2TSSs = c()
for(gene in genes){
Top2TSSs = rbind(Top2TSSs, FANTOM5mm10TSSannotation[grep(paste0("p1@", gene, ","), FANTOM5mm10TSSannotation[,4]),])
Top2TSSs = rbind(Top2TSSs, FANTOM5mm10TSSannotation[grep(paste0("p2@", gene, ","), FANTOM5mm10TSSannotation[,4]),])
}
Top2TSSs
## V1 V2 V3 V4 V5 V6
## 23490 chr15 54745902 54745916 p1@Nov,0.2919 -11 +
## 23489 chr15 54745870 54745881 p2@Nov,0.2772 -46 +
## 15554 chr12 86421628 86421662 p1@Esrrb,0.5901 0 +
## 15569 chr12 86470131 86470150 p2@Esrrb,0.3256 14 +
## 26717 chr16 23988625 23988641 p1@Bcl6,0.3561 -13 -
## 26716 chr16 23988600 23988607 p2@Bcl6,0.3050 4 -
## 138196 chr7 30636514 30636529 p1@Etv2,0.1369 -225 -
## 11368 chr11 84525514 84525542 p1@Lhx1,0.2494 0 -
## 101016 chr11 84525894 84525930 p2@Lhx1,0.2707 -360 -
## 2093 chr10 57486354 57486414 p1@Hsf2,0.4737 0 +
## 2094 chr10 57486418 57486454 p2@Hsf2,0.5200 33 +
## 54772 chr3 34649987 34650005 p1@Sox2,0.4826 0 +
## 54773 chr3 34650024 34650050 p2@Sox2,0.4802 -4 +
## 89595 chr9 55541190 55541209 p1@Isl2,0.4408 0 +
## 89593 chr9 55538635 55538676 p2@Isl2,0.1039 0 +
## 3715 chr10 81559447 81559498 p1@Aes,0.4399 0 +
## 3716 chr10 81559524 81559583 p2@Aes,0.4529 0 +
## 1511 chr10 30842765 30842791 p1@Hey2,0.5586 0 -
## 1513 chr10 30842811 30842831 p2@Hey2,0.5308 -10 -
## 72993 chr6 43309549 43309552 p1@Nobox,0.2041 1 -
## 83229 chr8 12395603 12395621 p1@Sox1,0.2398 84 +
## 140181 chr8 12395874 12395885 p2@Sox1,0.1326 355 +
## 113450 chr18 11052644 11052661 p1@Gata6,0.4626 134 +
## 33421 chr18 11052487 11052510 p2@Gata6,0.5117 0 +
## 29910 chr17 27728937 27728956 p1@Spdef,0.1132 0 -
## 111472 chr17 27729074 27729086 p2@Spdef,0.1740 -123 -
## 22639 chr15 102954474 102954495 p1@Hoxc11,0.4722 -30 +
## 22640 chr15 102954510 102954521 p2@Hoxc11,0.4106 -4 +
## 31078 chr17 35506052 35506069 p1@Pou5f1,0.3172 13 +
## 31077 chr17 35506028 35506049 p2@Pou5f1,0.3152 0 +
## 65220 chr5 123394782 123394826 p1@Mlxip,0.3923 0 +
## 65219 chr5 123394770 123394781 p2@Mlxip,0.3918 -16 +
## 73754 chr6 64729118 64729132 p1@Atoh1,0.3346 -13 +
## 73756 chr6 64729241 64729254 p2@Atoh1,0.3880 95 +
## 31893 chr17 47737036 47737107 p1@Tfeb,0.3986 0 +
## 112581 chr17 47737174 47737207 p2@Tfeb,0.3822 137 +
## 94378 chrX 7967817 7967875 p1@Gata1,0.5548 -2 -
## 94377 chrX 7967802 7967812 p2@Gata1,0.4441 2 -
## 40561 chr1 167689552 167689563 p1@Lmx1a,0.5086 0 +
## 40562 chr1 167689565 167689577 p2@Lmx1a,0.5453 7 +
## 22284 chr14 99298652 99298715 p1@Klf5,0.3062 0 +
## 107327 chr14 99298945 99298989 p2@Klf5,0.1920 254 +
## 21407 chr14 63245219 63245265 p1@Gata4,0.3098 0 -
## 21412 chr14 63271654 63271679 p2@Gata4,0.5574 12 -
## 85405 chr8 72319053 72319071 p1@Klf2,0.2649 0 +
## 141319 chr8 72318893 72318915 p2@Klf2,0.2366 -117 +
## 38846 chr1 118627943 118627951 p1@Tfcp2l1,0.3550 0 +
## 38847 chr1 118627986 118628014 p2@Tfcp2l1,0.4221 41 +
## 62572 chr4 55532465 55532485 p1@Klf4,0.3020 0 -
## 129031 chr4 55532179 55532238 p2@Klf4,0.1941 214 -
## 70482 chr6 122707592 122707608 p1@Nanog,p1@Nanogpd,0.2468 0 +
## 70483 chr6 122707646 122707655 p2@Nanog,p2@Nanogpd,0.2636 53 +
## V7 V8 V9
## 23490 54745902 54745916 60,179,113
## 23489 54745870 54745881 60,179,113
## 15554 86421628 86421662 60,179,113
## 15569 86470131 86470150 60,179,113
## 26717 23988625 23988641 60,179,113
## 26716 23988600 23988607 60,179,113
## 138196 30636514 30636529 211,211,211
## 11368 84525514 84525542 60,179,113
## 101016 84525894 84525930 60,179,113
## 2093 57486354 57486414 60,179,113
## 2094 57486418 57486454 60,179,113
## 54772 34649987 34650005 60,179,113
## 54773 34650024 34650050 60,179,113
## 89595 55541190 55541209 60,179,113
## 89593 55538635 55538676 211,211,211
## 3715 81559447 81559498 60,179,113
## 3716 81559524 81559583 60,179,113
## 1511 30842765 30842791 60,179,113
## 1513 30842811 30842831 60,179,113
## 72993 43309549 43309552 211,211,211
## 83229 12395603 12395621 60,179,113
## 140181 12395874 12395885 211,211,211
## 113450 11052644 11052661 60,179,113
## 33421 11052487 11052510 60,179,113
## 29910 27728937 27728956 211,211,211
## 111472 27729074 27729086 211,211,211
## 22639 102954474 102954495 60,179,113
## 22640 102954510 102954521 60,179,113
## 31078 35506052 35506069 60,179,113
## 31077 35506028 35506049 60,179,113
## 65220 123394782 123394826 60,179,113
## 65219 123394770 123394781 60,179,113
## 73754 64729118 64729132 60,179,113
## 73756 64729241 64729254 60,179,113
## 31893 47737036 47737107 60,179,113
## 112581 47737174 47737207 60,179,113
## 94378 7967817 7967875 60,179,113
## 94377 7967802 7967812 60,179,113
## 40561 167689552 167689563 60,179,113
## 40562 167689565 167689577 60,179,113
## 22284 99298652 99298715 60,179,113
## 107327 99298945 99298989 211,211,211
## 21407 63245219 63245265 60,179,113
## 21412 63271654 63271679 60,179,113
## 85405 72319053 72319071 60,179,113
## 141319 72318893 72318915 60,179,113
## 38846 118627943 118627951 60,179,113
## 38847 118627986 118628014 60,179,113
## 62572 55532465 55532485 60,179,113
## 129031 55532179 55532238 211,211,211
## 70482 122707592 122707608 60,179,113
## 70483 122707646 122707655 60,179,113
Top2TSSs = data.frame( chr = Top2TSSs[,1], TSS = apply(Top2TSSs[,c(2, 3, 6)], 1, function(x) if(x[3] == "+"){return(x[1])} else{return(x[2])}), strand = sapply(Top2TSSs[,6], function(x) if(x == "+"){return(1)} else{ return(-1)} ),
gene = sapply(Top2TSSs[,4], function(x) sub(".*@", "", gsub("\\,.*","", x))))
Top2TSSs$TSS = as.numeric(levels(Top2TSSs$TSS))[Top2TSSs$TSS]
chr = Top2TSSs$chr[-which(duplicated(Top2TSSs$gene))]
strand = Top2TSSs$strand[-which(duplicated(Top2TSSs$gene))]
genes = Top2TSSs$gene[-which(duplicated(Top2TSSs$gene))]
start = rep(0, times = length(genes))
end = rep(0, times = length(genes))
l = 35000
for(i in 1:length(genes)){
if(strand[i] == 1){
start[i] = max(Top2TSSs$TSS[which(Top2TSSs$gene == genes[i])]) + 500
end[i] = start[i] - l
}
else{
start[i] = min(Top2TSSs$TSS[which(Top2TSSs$gene == genes[i])]) - 500
end[i] = start[i] + l
}
}
target.regions = data.frame(chr = chr,
strand = sapply(strand, function(x) if(x == 1){return("+")} else{return("-")}),
gene = genes,
start = start,
end = end)
target.regions
## chr strand gene start end
## 1 chr15 + Nov 54746402 54711402
## 2 chr12 + Esrrb 86470631 86435631
## 3 chr16 - Bcl6 23988107 24023107
## 4 chr7 - Etv2 30636029 30671029
## 5 chr11 - Lhx1 84525042 84560042
## 6 chr10 + Hsf2 57486918 57451918
## 7 chr3 + Sox2 34650524 34615524
## 8 chr9 + Isl2 55541690 55506690
## 9 chr10 + Aes 81560024 81525024
## 10 chr10 - Hey2 30842291 30877291
## 11 chr6 - Nobox 43309052 43344052
## 12 chr8 + Sox1 12396374 12361374
## 13 chr18 + Gata6 11053144 11018144
## 14 chr17 - Spdef 27728456 27763456
## 15 chr15 + Hoxc11 102955010 102920010
## 16 chr17 + Pou5f1 35506552 35471552
## 17 chr5 + Mlxip 123395282 123360282
## 18 chr6 + Atoh1 64729741 64694741
## 19 chr17 + Tfeb 47737674 47702674
## 20 chrX - Gata1 7967312 8002312
## 21 chr1 + Lmx1a 167690065 167655065
## 22 chr14 + Klf5 99299445 99264445
## 23 chr14 - Gata4 63244765 63279765
## 24 chr8 + Klf2 72319553 72284553
## 25 chr1 + Tfcp2l1 118628486 118593486
## 26 chr4 - Klf4 55531738 55566738
## 27 chr6 + Nanog 122708146 122673146
target.regions$start[which(target.regions$gene == "Esrrb")] = min(Top2TSSs$TSS[which(Top2TSSs$gene == "Esrrb")]) - l
target.regions$end[which(target.regions$gene == "Esrrb")] = max(Top2TSSs$TSS[which(Top2TSSs$gene == "Esrrb")])
target.regions$start[which(target.regions$gene == "Gata4")] = min(Top2TSSs$TSS[which(Top2TSSs$gene == "Gata4")])
target.regions$end[which(target.regions$gene == "Gata4")] = max(Top2TSSs$TSS[which(Top2TSSs$gene == "Gata4")]) + l
target.regions
## chr strand gene start end
## 1 chr15 + Nov 54746402 54711402
## 2 chr12 + Esrrb 86386628 86470131
## 3 chr16 - Bcl6 23988107 24023107
## 4 chr7 - Etv2 30636029 30671029
## 5 chr11 - Lhx1 84525042 84560042
## 6 chr10 + Hsf2 57486918 57451918
## 7 chr3 + Sox2 34650524 34615524
## 8 chr9 + Isl2 55541690 55506690
## 9 chr10 + Aes 81560024 81525024
## 10 chr10 - Hey2 30842291 30877291
## 11 chr6 - Nobox 43309052 43344052
## 12 chr8 + Sox1 12396374 12361374
## 13 chr18 + Gata6 11053144 11018144
## 14 chr17 - Spdef 27728456 27763456
## 15 chr15 + Hoxc11 102955010 102920010
## 16 chr17 + Pou5f1 35506552 35471552
## 17 chr5 + Mlxip 123395282 123360282
## 18 chr6 + Atoh1 64729741 64694741
## 19 chr17 + Tfeb 47737674 47702674
## 20 chrX - Gata1 7967312 8002312
## 21 chr1 + Lmx1a 167690065 167655065
## 22 chr14 + Klf5 99299445 99264445
## 23 chr14 - Gata4 63245265 63306679
## 24 chr8 + Klf2 72319553 72284553
## 25 chr1 + Tfcp2l1 118628486 118593486
## 26 chr4 - Klf4 55531738 55566738
## 27 chr6 + Nanog 122708146 122673146
target.regions = data.frame(chr = target.regions$chr, strand = target.regions$strand, gene = target.regions$gene, start = apply(target.regions[ ,c("start", "end")], 1, min), end = apply(target.regions[ ,c("start", "end")], 1, max))
target.regions
## chr strand gene start end
## 1 chr15 + Nov 54711402 54746402
## 2 chr12 + Esrrb 86386628 86470131
## 3 chr16 - Bcl6 23988107 24023107
## 4 chr7 - Etv2 30636029 30671029
## 5 chr11 - Lhx1 84525042 84560042
## 6 chr10 + Hsf2 57451918 57486918
## 7 chr3 + Sox2 34615524 34650524
## 8 chr9 + Isl2 55506690 55541690
## 9 chr10 + Aes 81525024 81560024
## 10 chr10 - Hey2 30842291 30877291
## 11 chr6 - Nobox 43309052 43344052
## 12 chr8 + Sox1 12361374 12396374
## 13 chr18 + Gata6 11018144 11053144
## 14 chr17 - Spdef 27728456 27763456
## 15 chr15 + Hoxc11 102920010 102955010
## 16 chr17 + Pou5f1 35471552 35506552
## 17 chr5 + Mlxip 123360282 123395282
## 18 chr6 + Atoh1 64694741 64729741
## 19 chr17 + Tfeb 47702674 47737674
## 20 chrX - Gata1 7967312 8002312
## 21 chr1 + Lmx1a 167655065 167690065
## 22 chr14 + Klf5 99264445 99299445
## 23 chr14 - Gata4 63245265 63306679
## 24 chr8 + Klf2 72284553 72319553
## 25 chr1 + Tfcp2l1 118593486 118628486
## 26 chr4 - Klf4 55531738 55566738
## 27 chr6 + Nanog 122673146 122708146
We’ll take a look at these in the genome browser.
Nov promoter
This promoter looks very safe to target.
Esrrb promoter
Safe to target.
Bcl6 promoter
Safe to target.
Etv2 promoter
This one is problematic, but not fully. Rbm42 and Haus5 run on the opposite strand, so we can probably target up to the TTS of these genes (30650317 and 30664994, respectively).
Lhx1 promoter
There is a lncRNA on the opposite strand (Lhx1os with TSS 84525660). If we target further upstream of this, we should be fine. We should also check the distribution of the guides to ensure that the effect is not due to Lhx1os.
Hsf2 promoter
Again, we have the same issue. We have to target further upstream to the TSS of 4930467K11Rik (57,486,351).
Sox2 promoter
Again a lncRNA is nearby, but this time on the same strand as the gene. We can target downstream of the TSS of Sox2ot (34,638,174) to ensure we are targetting Sox2.
Isl2 promoter
There is a gene in the opposite direction of Isl2, Etfa (TSS: 55,512,243). We will have to target about 5kb in the opposite direction to ensure we’re not targetting Etfa.
Aes promoter
Again, there’s a gene in the opposite direction, Gna11 (TSS: 81,545,190). We’ll have to do the same as above.
Hey2 promoter
This gene looks safe to target.
Nobox promoter
This gene looks safe to target.
Sox1 promoter
There is a predicted gene with a TSS upstream of the TSS, but I’m not sure how much we have to do about this.
Gata6 promoter
There are lncRNAs in the opposite direction that we may have to worry about. 1010001N08Rik with TSS at 11,052,567.
Spdef promoter
There is a gene nearby, but it runs in the same direction as Spdef, so I don’t think there’s much we have to worry about as long as we don’t target in the gene (D17Wsu92e with TSS 27,751,232).
Hoxc11 promoter
There a whole mess here. If we target Hoxc11, we might also target Hotair. I don’t know if we should even include this gene.
Pou5f1 promoter
The only gene nearby is Tcf19, but this runs in the opposite direction so I don’t think we have to worry about this.
Mlxip promoter
There is a gene running in the opposite direction Bcl7a with TTS 123,374,992.
Atoh1 promoter
This gene looks fine.
Tfeb promoter
This gene looks problematic. The promoter for Pgc (TSS: 47,734,482) essentially overlaps with Tfeb. This gene will have to be verified.
Gata1 promoter
This gene looks good.
Lmx1a promoter
This gene looks good.
Klf5 promoter
This gene looks good.
Gata4 promoter
This gene looks good.
Klf2 promoter
There is a pseudogene in the opposite direction that we may have to worry about. Gm10282 has a TSS at 72,305,260, so we will have to limit ourselves to about 5kb away from this.
Tfcp2l1 promoter
Again a problem here. We will have to limit ourselves to the TTS of Clasp1 (118,612,678).
Klf4 promoter
There is a predicted gene about 20Kb from the TSS of Klf4. We should be relatively safe, but to be sure we can target up to 5kb from the TSS of Gm12511 (55,563,265).
Nanog promoter
This gene is safe.
target.regions$start[which(target.regions$gene == "Lhx1")] = max(c(84525660 + 500, target.regions$start[which(target.regions$gene == "Lhx1")])) # avoid lncRNA on opposite strand
target.regions$end[which(target.regions$gene == "Etv2")] = min(c(target.regions$end[which(target.regions$gene == "Etv2")], 30650317 - 500, 30664994 - 500)) # avoid end of genes on same strand
target.regions$end[which(target.regions$gene == "Hsf2")] = min(c(target.regions$end[which(target.regions$gene == "Hsf2")], 57486351 - 500)) # avoid lncRNA on opposite strand
target.regions$start[which(target.regions$gene == "Sox2")] = max(c(target.regions$start[which(target.regions$gene == "Sox2")], 34638174 + 500)) # avoid lncRNA upstream on same strand
target.regions$start[which(target.regions$gene == "Isl2")] = target.regions$end[which(target.regions$gene == "Isl2")] - 5000 # avoid gene upstream on opposite strand
target.regions$start[which(target.regions$gene == "Aes")] = target.regions$end[which(target.regions$gene == "Aes")] - 5000 # avoid genes upstream on opposite strand
target.regions = target.regions[-which(target.regions$gene == "Gata6"), ] # Gata6 is a problem
target.regions$end[which(target.regions$gene == "Spdef")] = min(c(target.regions$end[which(target.regions$gene == "Spdef")], 27751232 - 500)) #there is predicted gene upstream of Spdef
target.regions = target.regions[-which(target.regions$gene == "Hoxc11"), ] # Hoxc11 is a problem
target.regions$start[which(target.regions$gene == "Mlxip")] = max(c(target.regions$start[which(target.regions$gene == "Mlxip")], 123374992 + 500)) # gene upstream of Mlxip
target.regions = target.regions[-which(target.regions$gene == "Tfeb"), ] # Tfeb is a problem
target.regions$start[which(target.regions$gene == "Klf2")] = max(c(target.regions$start[which(target.regions$gene == "Klf2")], 72305260 + 5000))
target.regions$start[which(target.regions$gene == "Tfcp2l1")] = max(c(target.regions$start[which(target.regions$gene == "Tfcp2l1")], 118612678))
target.regions$end[which(target.regions$gene == "Klf4")] = min(c(target.regions$end[which(target.regions$gene == "Klf4")], 55563265 - 5000))
target.regions$gene = factor(target.regions$gene, levels = unique(target.regions$gene))
target.regions
## chr strand gene start end
## 1 chr15 + Nov 54711402 54746402
## 2 chr12 + Esrrb 86386628 86470131
## 3 chr16 - Bcl6 23988107 24023107
## 4 chr7 - Etv2 30636029 30649817
## 5 chr11 - Lhx1 84526160 84560042
## 6 chr10 + Hsf2 57451918 57485851
## 7 chr3 + Sox2 34638674 34650524
## 8 chr9 + Isl2 55536690 55541690
## 9 chr10 + Aes 81555024 81560024
## 10 chr10 - Hey2 30842291 30877291
## 11 chr6 - Nobox 43309052 43344052
## 12 chr8 + Sox1 12361374 12396374
## 14 chr17 - Spdef 27728456 27750732
## 16 chr17 + Pou5f1 35471552 35506552
## 17 chr5 + Mlxip 123375492 123395282
## 18 chr6 + Atoh1 64694741 64729741
## 20 chrX - Gata1 7967312 8002312
## 21 chr1 + Lmx1a 167655065 167690065
## 22 chr14 + Klf5 99264445 99299445
## 23 chr14 - Gata4 63245265 63306679
## 24 chr8 + Klf2 72310260 72319553
## 25 chr1 + Tfcp2l1 118612678 118628486
## 26 chr4 - Klf4 55531738 55558265
## 27 chr6 + Nanog 122673146 122708146
mouse_enhancer_mESC = read.table(file = "mouse_enhancer_mESC.txt", col.names = c("chr", "start", "end", "cell.Type", "strand"))
hist(mouse_enhancer_mESC$end - mouse_enhancer_mESC$start, breaks = 100)
library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
target.regions.gr = with(target.regions, GRanges(chr, IRanges(start, end), id=gene, strand = strand))
head(target.regions.gr)
## GRanges object with 6 ranges and 1 metadata column:
## seqnames ranges strand | id
## <Rle> <IRanges> <Rle> | <factor>
## [1] chr15 [54711402, 54746402] + | Nov
## [2] chr12 [86386628, 86470131] + | Esrrb
## [3] chr16 [23988107, 24023107] - | Bcl6
## [4] chr7 [30636029, 30649817] - | Etv2
## [5] chr11 [84526160, 84560042] - | Lhx1
## [6] chr10 [57451918, 57485851] + | Hsf2
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
mouse_enhancer_mESC.gr = with(mouse_enhancer_mESC, GRanges(chr, IRanges(start, end)))
overlap = mergeByOverlaps(mouse_enhancer_mESC.gr, target.regions.gr)
length(overlap$id)
## [1] 27
head(overlap)
## DataFrame with 6 rows and 3 columns
## mouse_enhancer_mESC.gr target.regions.gr id
## <GRanges> <GRanges> <factor>
## 1 chr3:34645278-34647278 chr3:34638674-34650524:+ Sox2
## 2 chr6:64693906-64695906 chr6:64694741-64729741:+ Atoh1
## 3 chr6:122691182-122693182 chr6:122673146-122708146:+ Nanog
## 4 chr6:122692682-122694682 chr6:122673146-122708146:+ Nanog
## 5 chr8:12389800-12391800 chr8:12361374-12396374:+ Sox1
## 6 chr8:72309501-72311501 chr8:72310260-72319553:+ Klf2
overlap$id
## [1] Sox2 Atoh1 Nanog Nanog Sox1 Klf2 Klf2 Esrrb Esrrb Esrrb
## [11] Esrrb Esrrb Esrrb Esrrb Esrrb Esrrb Klf5 Klf5 Klf5 Klf5
## [21] Bcl6 Spdef Spdef Spdef Spdef Pou5f1 Gata1
## 24 Levels: Nov Esrrb Bcl6 Etv2 Lhx1 Hsf2 Sox2 Isl2 Aes Hey2 Nobox ... Nanog
#sapply(overlap, width)
library(Biostrings)
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(BSgenome.Mmusculus.UCSC.mm10)
## Loading required package: BSgenome
## Loading required package: rtracklayer
mm10 = BSgenome.Mmusculus.UCSC.mm10
target.seqs = getSeq(mm10, target.regions.gr)
head(target.seqs)
## A DNAStringSet instance of length 6
## width seq
## [1] 35001 CCAAAAACCCATAGATGGGAGGTCATATGTT...CAACAACCAGACTGGCATTTGCATGGGTAAT
## [2] 83504 GGTATGGGAGTGACCCCATTAATAAAGGCGT...CCTCTCACCTCACTTAGAACTGACTCCCGCG
## [3] 35001 AGTTGTTGTGCAGCAATTCTTCTTCCTTTAC...TGGGTTGCGCGGCCTGGGAGGGCGGACCGCG
## [4] 13789 GCGGGGAGGCATTAGCTGGGATGAAGGCGCG...AGGAAGGACAGGGGAACACCAAGATGCAGGG
## [5] 33883 CATGTCTGCCTTCAATTTTGTTACACACATT...CGGGCCTCCGGTTTCTCCCGCCCCCATCAGG
## [6] 33934 TATACAGACACTAAGAGAACACAAATTCCAG...TGCTTCAGGTTTTGATAAAACTGTCCTTAAA
library(seqinr);
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
wanted_seqs = list(genes = target.regions$gene,
seqs = sapply(target.seqs, toString),
chr = target.regions$chr,
start = target.regions$start,
end = target.regions$end,
strand = target.regions$strand)
write_seqs <- function(seqs, gene_names, chrom, start_pos, end_pos, strand, filename){
stopifnot(dim(seqs)[1] == length(gene_names))
write.fasta(file.out = filename, sequences = seqs[1], names = paste0(chrom[1], "\t", start_pos[1], "\t", end_pos[1], "\t", gene_names[1], "\t", strand[1]), open = "w", nbchar = 80, as.string = TRUE)
if(length(gene_names) > 1){
for(i in 2:length(gene_names)){
write.fasta(file.out = filename, sequences = seqs[i], names = paste0(chrom[i], "\t", start_pos[i], "\t", end_pos[i], "\t", gene_names[i], "\t", strand[1]), open = "a", nbchar = 80, as.string = TRUE)
}
}
}
write_seqs(wanted_seqs$seqs, wanted_seqs$genes, wanted_seqs$chr, round(wanted_seqs$start), round(wanted_seqs$end), wanted_seqs$strand, "tiling_screen_promoters.fa")
write.table(wanted_seqs$genes, file = "genes.txt", sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE)
Xueqiu found some regions that we will want to include, so I will concatenate all of the fastq files to ensure that they don’t overlap.
cat top27escGene_30Kpromoters_mm10_modified100Kb_Klf4Gata6Nanog_tab.fa top27escGene_TSS_FANTOM5_top2_25geneTAD_mm10_Anchors_merged_uniq_tab.fa top27escGene_TSS_FANTOM5_top2_25geneTAD_mm10_RNAPII_Anchors_merged_uniq_tab.fa top27escGene_TSS_FANTOM5_top2_mm10_enhancers_noPromoters_tab.fa tiling_screen_promoters.fa > tiling_screen.fa
grep chr tiling_screen.fa | cut -c 2- | cut -f 1-4 > tiling_regions.txt
tiling_regions = read.table(file = "tiling_regions.txt", sep = "\t", col.names = c("chr", "start", "end", "gene"), colClasses = c("factor", "integer", "integer", "factor"))
tiling_regions$start = as.numeric(tiling_regions$start)
tiling_regions$end = as.numeric(tiling_regions$end)
library(GenomicRanges)
tiling_regions.gr = with(tiling_regions, GRanges(chr, IRanges(start, end), id=gene))
overlaps = countOverlaps(tiling_regions.gr, tiling_regions.gr)
~/sgRNA/sgRNAdesign/propose_sgRNAs -i tiling_screen_promoters.fa -V -R -c ~/sgRNA/Meng/guide\ design/enzyme_cutting_seqs.txt -o tiling_guides.fa
while read gene; do
n_lines="$(grep ${gene} tiling_guides.txt | wc -l)";
printf "%s\t%s\n" "${gene}" "${n_lines}";
done < genes.txt
wc -l tiling_guides.fa
head tiling_guides.fa